// bdlde_quotedprintabledecoder.h                                     -*-C++-*-
#ifndef INCLUDED_BDLDE_QUOTEDPRINTABLEDECODER
#define INCLUDED_BDLDE_QUOTEDPRINTABLEDECODER

#include <bsls_ident.h>
BSLS_IDENT("$Id: $")

//@PURPOSE: Provide automata converting to and from Quoted-Printable encodings.
//
//@CLASSES:
//  bdlde::QuotedPrintableDecoder: automata for Quoted-Printable decoding
//
//@SEE_ALSO: bdlde_quotedprintableencoder
//
//@DESCRIPTION: This component provides a template class (parameterized
// separately on both input and output iterators) that can be used to decode
// byte sequences of arbitrary length from the Quoted Printable representation
// described in Section 6.7 "Quoted-Printable Content Transfer Encoding" of RFC
// 2045, "Multipurpose Internet Mail Extensions (MIME) Part One: Format of
// Internet Message Bodies."
//
// Each instance of the decoder retains the state of the conversion from one
// supplied input to the next, enabling the processing of segmented input --
// i.e., processing resumes where it left off with the next invocation on new
// input.  Instance methods are provided for the decoder to (1) assert the end
// of input, (2) determine whether the input so far is currently acceptable,
// and (3) indicate whether a non-recoverable error has occurred.
//
///Quoted-Printable Decoding
///-------------------------
// (In the following, all rules mentioned refer to those listed in the encoder
// section above.)
//
// The decoding process for this encoding scheme involves:
//
// 1. transforming any encoded character triplets back into their original
//    representation (rule #1 and rule #4).
// 2. literally writing out characters that have not been changed (rule #2).
// 3. deleting any trailing whitespace at the end of an encoded line (rule #3).
//
// 4. removing the soft line breaks including the `=` prefix (i.e.,
//    concatenating broken sentences) (rule #5).
//
// The standard imposes a maximum of 76 characters exclusive of CRLF; however,
// the decoder implemented in this component will handle lines of arbitrary
// length.
//
// The decoder also provides support for 2 error-reporting modes: the strict
// mode and the relaxed mode (configurable at construction).  A strict-mode
// decoder stops decoding at the first offending character encountered, while a
// relaxed-mode decoder would continue decoding to the end of the input,
// allowing straight pass-through of character sets that cannot be interpreted.
//
// The following kinds of errors can be encountered during decoding, listed in
// order of decreasing order of precedence:
// ```
// E1. BAD_DATA
// ```
// An `=` character is not followed by either two uppercase hexadecimal digits,
// or a soft line break -- e.g.,
// ```
//  '=4=' (only one hexadecimal)
//  '=K3' (K3 is not a hexadecimal number)
//  '=1f' (lower case f is a literally encoded character)
// ```
//
// Note that:
//
// 1. In the relaxed error-reporting mode of this implementation, lowercase
//    hexadecimal digits are treated as valid numerals.
// 2. E1 can be caused by a missing or corrupted numeric, a corrupted character
//    disguised as an `=`, or an accidental insertion of a `=` that does not
//    belong.
// 3. The case where a seemingly valid character is found in place of a missing
//    numeric cannot be detected, e.g., `=4F` where `F` is actually a literally
//    encoded character.
// 4. An erroneous occurrence of a `=` character preceding 2 seemingly valid
//    hexadecimal numerics is also undetectable, e.g., `=4F` where `=` was
//    actually a `t` corrupted during transmission.
// ```
// E2. BAD_LINEBREAK
// ```
// A '\r' is not followed by a '\n'.  In the relaxed mode, each stand-alone
// '\r' or '\n' will be copied straight through to the output.  For soft line
// breaks, whitespace is ignored between the `=` character and the CRLF as they
// are to be treated and removed as transport padding.
// ```
// E3. BAD_LINELENTH
// ```
// An encoded line exceeds the specified maximum line length with missing soft
// line breaks.  (Because input of flexible line lengths is allowed in this
// implementation, this error is not detected or reported.)
//
// In the relaxed-mode, errors of the types E1 and E2 would be copied straight
// to output and type E3 ignored.  Decoded lines will be broken even when a
// bare CRLF is encountered in this mode.  Users can still be alerted to the
// the unreported errors as offending characters are copied straight through to
// the output stream, which can be observed.
//
// The `isError` method is used to detect the above anomalies, while for the
// `convert` method, a `numIn` output parameter (indicating the number of input
// characters consumed) or possibly the iterator itself (for iterators with
// reference-semantics) identifies the offending character.
//
///Usage
///-----
// This section illustrates intended use of this component.
//
///Example 1: Basic Usage
/// - - - - - - - - - - -
// TBD

#include <bdlscm_version.h>

#include <bsl_cstring.h>
#include <bsl_queue.h>
#include <bsl_vector.h>

namespace BloombergLP {
namespace bdlde {

/// This class implements a mechanism capable of converting data of
/// arbitrary length from its corresponding Quoted-Printable representation.
class QuotedPrintableDecoder {

    // PRIVATE TYPES

    /// Symbolic state values.
    enum {
        e_ERROR_STATE        = -1, // input is irreparably invalid
        e_INPUT_STATE        =  0, // general input state
        e_SAW_EQUAL_STATE    =  1, // need two hexadecimal values or CR LF
        e_SAW_WS_STATE       =  2, // saw a whitespace
        e_NEED_HEX_STATE     =  3, // need one hexadecimal value
        e_NEED_SOFT_LF_STATE =  4, // need soft new line
        e_NEED_HARD_LF_STATE =  5, // need soft new line
        e_DONE_STATE         =  6  // any additional input is an error
    };

  public:

    enum EquivalenceClasses {
        // This enumeration type enumerates the input equivalence classes.
        // Separate enums are given to variants resulting from different modes
        // of operation to eliminate an extra step of mode checking inside the
        // main decoding loop.

                       // Regular character - copy straight to output
        e_RC_ = 0, // strict mode
        e_RC,      // relaxed mode

                       // Hexadecimal digit - numeral only when preceded by
                       // '='; otherwise a regular character
        e_HX_,     // strict mode
        e_HX,      // relaxed mode

                       // '=' - wait for more input
        e_EQ_,     // strict mode
        e_EQ,      // relaxed mode

                       // Whitespace        - buffer; wait for more input
        e_WS_,     // strict mode
        e_WS,      // relaxed mode

                       // Carriage return
        e_CR_,     // strict mode       - wait for further input
        e_CR,      // relaxed mode      - wait for further input

                       // Line Feed Strict mode
                       // ------------
        e_LC_,     // CRLF_MODE         - decode to "\r\n" if preceded by
                       // '\r'; report error otherwise
        e_LL_,     // LF_MODE           - decode to '\n' if preceded by
                       // '\r' report error otherwise Relaxed mode
                       // ------------
        e_LC,      // CRLF_MODE         - decode to "\r\n" if preceded by
                       // '\r'; ignore otherwise
        e_LL,      // LF_MODE           - decode to "\n" if preceded by
                       // '\r'; ignore otherwise

                       // Unrecognized char - halt and report error
        e_UC_,     // strict mode       - Ignore and halt decoding
        e_UC       // relaxed mode      - Ignore but continue decoding
#ifndef BDE_OMIT_INTERNAL_DEPRECATED
      , BDEDE_RC_ = e_RC_
      , BDEDE_RC = e_RC
      , BDEDE_HX_ = e_HX_
      , BDEDE_HX = e_HX
      , BDEDE_EQ_ = e_EQ_
      , BDEDE_EQ = e_EQ
      , BDEDE_WS_ = e_WS_
      , BDEDE_WS = e_WS
      , BDEDE_CR_ = e_CR_
      , BDEDE_CR = e_CR
      , BDEDE_LC_ = e_LC_
      , BDEDE_LL_ = e_LL_
      , BDEDE_LC = e_LC
      , BDEDE_LL = e_LL
      , BDEDE_UC_ = e_UC_
      , BDEDE_UC = e_UC
#endif  // BDE_OMIT_INTERNAL_DEPRECATED
    };

    enum LineBreakMode {
        // Configuration governing how line breaks are decoded.

        e_CRLF_MODE,       // "\r\n" are decoded to "\r\n".
        e_LF_MODE          // "\r\n" are decoded to "\n".
#ifndef BDE_OMIT_INTERNAL_DEPRECATED
      , BDEDE_CRLF_MODE = e_CRLF_MODE
      , BDEDE_LF_MODE = e_LF_MODE
#endif  // BDE_OMIT_INTERNAL_DEPRECATED
    };

    // CLASS DATA

    /// Name of component used when reporting errors.
    static const char s_componentName[];

    /// Default error reporting mode
    static const bool s_defaultUnrecognizedIsErrorFlag;

    /// Default map of `unsigned char` to equivalence class for strict mode
    static const char *s_defaultEquivClassStrict_p;

    /// Default map of `unsigned char` to equivalence class for CRLF line
    /// break mode
    static const char *s_defaultEquivClassCRLF_p;

    /// Character map used for converting an ASCII character to the
    /// hexadecimal value it is representing.
    static const unsigned char *const s_decodingMap_p;

    static const int   s_defaultMaxLineLength; // Default maximum line length
    static const char* s_lineBreakModeName[];  // Names of line break mode

    // INSTANCE DATA
    bool d_unrecognizedIsErrorFlag;  // If true, fail on "bad" characters
    LineBreakMode d_lineBreakMode;   // Line break mode
    int  d_state;         // TBD doc
    char d_buffer[90];    // TBD doc
    int  d_bufferLength;  // TBD doc
    char d_hexBuffer;     // TBD doc
    int  d_outputLength;  // Total number of output characters
    char *d_equivClass_p; // Map of `unsigned char` to input equivalence class;
                          // dynamically allocated because there is no default
                          // complete configuration.

  private:
    // NOT IMPLEMENTED
    QuotedPrintableDecoder(const QuotedPrintableDecoder&);
    QuotedPrintableDecoder& operator=(const QuotedPrintableDecoder&);

  public:
    // CLASS METHODS

    /// Return the ASCII string describing the specified `mode` governing
    /// the decoding of hard linebreaks ("\r\n").  The behavior is undefined
    /// unless `mode` is either e_CRLF_MODE or e_LF_MODE.
    static const char* lineBreakModeToAscii(LineBreakMode mode);

    // CREATORS

    /// Create a Quoted-Printable decoder in the initial state, set to the
    /// strict or relaxed error-reporting mode according to whether the
    /// specified `detectError` flag is `true` or `false`, respectively, and
    /// also configured to the specified `lineBreakMode`.  The behavior is
    /// undefined unless `lineBreakMode` is either e_CRLF_MODE or
    /// e_LF_MODE.  Note that the decoder reports errors in the strict
    /// mode and output offending characters in the relaxed mode.  Hard line
    /// breaks ("\r\n") are decoded to "\r\n" in e_CRLF_MODE (default)
    /// and to '\n' in e_LF_MODE.
    explicit
    QuotedPrintableDecoder(
        bool                                  detectError,
        QuotedPrintableDecoder::LineBreakMode lineBreakMode =
                                          QuotedPrintableDecoder::e_CRLF_MODE);

    /// Destroy this object.
    ~QuotedPrintableDecoder();

    // MANIPULATORS

    /// Append to the buffer addressed by the specified `out` all pending
    /// output (if there is any) up to the optionally specified `maxNumOut`
    /// limit (default is negative, meaning no limit) and, when there is no
    /// pending output and `maxNumOut` is still not reached, begin to
    /// consume and decode a sequence of input characters starting at the
    /// specified `begin` position, up to but not including the specified
    /// `end` position, writing any resulting output in the specified
    /// `output` buffer up to the (cumulative) `maxNumOut` limit.  If
    /// `maxNumOut` limit is reached, no further input will be consumed.
    /// Load into the specified `numOut` and `numIn` the number of output
    /// bytes produced and input bytes consumed, respectively.  Return a
    /// non-negative value on success and a negative value otherwise.  A
    /// successful return status indicates the number of characters that
    /// would be output if `endConvert` were called with no output limit
    /// immediately upon exit from this method.  These bytes are also
    /// available for output if this method is called with a sufficiently
    /// large `maxNumOut`.  Note that calling this method after `endConvert`
    /// has been invoked without an intervening `reset` call will place this
    /// instance in an error state, and return an error status.  Note also
    /// that it is recommended that after all calls to `convert` are
    /// finished, the `endConvert` method be called to complete the decoding
    /// of any unprocessed input characters (e.g., whitespace).
    int convert(char       *out,
                int        *numOut,
                int        *numIn,
                const char *begin,
                const char *end,
                int         maxNumOut = -1);

    /// Terminate encoding for this decoder; write any retained output
    /// (e.g., from a previous call to `convert` with a non-zero `maxNumOut`
    /// argument) to the specified `out` buffer.  Optionally specify the
    /// `maxNumOut` limit on the number of bytes to output; if `maxNumOut`
    /// is negative, no limit is imposed.  Load into the specified `numOut`
    /// the number of output bytes produced.  Return 0 on success with no
    /// pending output, the positive number of bytes (if any) that would be
    /// output if `endConvert` were called with no output limit immediately
    /// upon exit from this method, and a negative value otherwise.  Any
    /// retained bytes are available on a subsequent call to `endConvert`.
    /// Once this method is called, no additional input may be supplied
    /// without an intervening call to `reset`; once this method returns a
    /// zero status, a subsequent call will place this decoder in the error
    /// state, and return an error status.
    int endConvert(char *out, int *numOut, int maxNumOut = -1);

    /// Reset this decoder to its initial state (i.e., as if no input had
    /// been consumed).
    void reset();

    // ACCESSORS

    /// Return `true` if the input read so far by this decoder is considered
    /// syntactically complete and all resulting output has been emitted;
    /// return `false` otherwise.  Note that there must not be any
    /// unprocessed characters accumulated in the input buffer of this
    /// decoder.
    bool isAccepting() const;

    /// Return `true` if this decoder is in the done state (i.e.,
    /// `endConvert` has been called and any additional input will result in
    /// an error), and if there is no pending output; return `false`
    /// otherwise.
    bool isDone() const;

    /// Return `true` if this decoder has encountered an irrecoverable error
    /// and `false` otherwise.  An irrecoverable error is one for which
    /// there is no subsequent possibility of achieving an "acceptable"
    /// result (as defined by the `isAccepting` method).
    bool isError() const;

    /// Return `true` if this decoder is in the initial state (i.e., as if
    /// no input had been consumed) and `false` otherwise.
    bool isInitialState() const;

    /// Return `true` if the input to this decoder is maximal (i.e., the
    /// input contains an end-of-input sentinel, signaling that no further
    /// input should be expected).  *Always* returns `false` for
    /// Quoted-Printable decoders since the encoding scheme does not specify
    /// an end-of-input sentinel.
    bool isMaximal() const;

    /// Return `true` if this decoder is currently configured to detect an
    /// error when an unrecognizable encoding is encountered, and `false`
    /// otherwise.
    bool isUnrecognizedAnError() const;

    /// Return the line break mode specified for this decoder.
    LineBreakMode lineBreakMode() const;

    /// Return the number of output bytes retained by this decoder and not
    /// emitted because `maxNumOut` has been reached.
    int numOutputPending() const;

    /// Return the total length of the output emitted by this decoder
    /// (possibly after several calls to the `convert` or the `input`
    /// methods) since its initial construction or the latest `reset`.
    int outputLength() const;
};

// ============================================================================
//                             INLINE DEFINITIONS
// ============================================================================

// CLASS METHODS
inline
const char* QuotedPrintableDecoder::lineBreakModeToAscii(
    LineBreakMode mode)
{
    return s_lineBreakModeName[mode];
}

// CREATORS
inline
QuotedPrintableDecoder::QuotedPrintableDecoder(
                 bool                                  unrecognizedIsErrorFlag,
                 QuotedPrintableDecoder::LineBreakMode lineBreakMode)
: d_unrecognizedIsErrorFlag(unrecognizedIsErrorFlag)
, d_lineBreakMode(lineBreakMode)
, d_state(e_INPUT_STATE)
, d_bufferLength(0)
, d_outputLength(0)
{
    if (unrecognizedIsErrorFlag) {
        // Strict mode
        d_equivClass_p = const_cast<char*>(s_defaultEquivClassStrict_p);
    }
    else {
        if (lineBreakMode == e_CRLF_MODE) {
            d_equivClass_p = const_cast<char*>(s_defaultEquivClassCRLF_p);
        }
        else {
            // First copy the map of equivalence classes for the
            // e_CRLF_MODE to the strict error-report mode.

            int len = sizeof(*s_defaultEquivClassCRLF_p) * 256;
            d_equivClass_p = new char[len];
            bsl::memcpy(d_equivClass_p, s_defaultEquivClassCRLF_p, len);
            d_equivClass_p['\n'] = e_LL;  // output '\n' instead if preceded
                                          // by '='.
        }
    }
}

// MANIPULATORS
inline
void QuotedPrintableDecoder::reset()
{
    d_state = e_INPUT_STATE;
    d_outputLength = 0;
    d_bufferLength = 0;
}

// ACCESSORS
inline
bool QuotedPrintableDecoder::isAccepting() const
{
    return e_INPUT_STATE == d_state || e_DONE_STATE == d_state;
}

inline
bool QuotedPrintableDecoder::isDone() const
{
    return e_DONE_STATE == d_state && 0 == d_bufferLength;
}

inline
bool QuotedPrintableDecoder::isError() const
{
    return e_ERROR_STATE == d_state;
}

inline
bool QuotedPrintableDecoder::isInitialState() const
{
    return e_INPUT_STATE == d_state && 0 == d_outputLength;
}

inline
bool QuotedPrintableDecoder::isMaximal() const
{
    return false;
}

inline
bool QuotedPrintableDecoder::isUnrecognizedAnError() const
{
    return d_unrecognizedIsErrorFlag;
}

inline
QuotedPrintableDecoder::LineBreakMode
QuotedPrintableDecoder::lineBreakMode() const
{
    return d_lineBreakMode;
}

inline
int QuotedPrintableDecoder::numOutputPending() const
{
    return d_bufferLength;
}

inline
int QuotedPrintableDecoder::outputLength() const
{
    return d_outputLength;
}

}  // close package namespace
}  // close enterprise namespace

#endif

// ----------------------------------------------------------------------------
// Copyright 2015 Bloomberg Finance L.P.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------- END-OF-FILE ----------------------------------
